

***SET THE DIRECToRY to Replication_package
clear
global rootpath "D:\Replication_package"

********************************************************************************
//Here I'm assembling the NEI dataset
import delimited "$rootpath/raw_data/emis_sum_fac_15420.csv", clear

replace totalemissions=0 if totalemissions==.
gen ann_value=totalemissions if emissionsuom=="TON"
replace ann_value=totalemissions/2000 if emissionsuom=="LB"
rename eisfacilityid facility_id
rename pollutantcode poll
rename naicscode naics
rename sitename facility_name
rename postalabbreviation addr_state_cd
keep facility_id poll ann_value facility_name zipcode naics addr_state_cd fipscode
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
///Here I delete duplicates
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode addr_state_cd naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All fipscode
gen year=2017
order facility_id  facility_name year zipcode addr_state_cd naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/NEI_2017_Alt.dta", replace


import delimited "$rootpath/raw_data/2014v2facilities.csv", clear
rename uom emissionsuom
rename total_emissions totalemissions
replace totalemissions=0 if totalemissions==.
gen ann_value=totalemissions if emissionsuom=="TON"
replace ann_value=totalemissions/2000 if emissionsuom=="LB"
rename eis_facility_site_id facility_id
rename pollutant_cd poll
rename naics_cd naics
rename facility_site_name facility_name
rename address_postal_code zipcode
rename state_and_county_fips_code fipscode
keep facility_id poll ann_value facility_name zipcode naics addr_state_cd fipscode
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
///Here I delete duplicates
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode addr_state_cd naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All fipscode
gen year=2014
order facility_id  facility_name year zipcode addr_state_cd naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/NEI_2014_Alt.dta", replace


import delimited "$rootpath/raw_data/2011neiv2_facility.csv", clear
rename uom emissionsuom
rename total_emissions totalemissions
replace totalemissions=0 if totalemissions==.
gen ann_value=totalemissions if emissionsuom=="TON"
replace ann_value=totalemissions/2000 if emissionsuom=="LB"
rename eis_facility_site_id facility_id
rename pollutant_cd poll
rename naics_cd naics
rename facility_site_name facility_name
rename address_postal_code zipcode
rename state_and_county_fips_code fipscode
keep facility_id poll ann_value facility_name zipcode naics addr_state_cd fipscode
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)
replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
///Here I delete duplicates
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode addr_state_cd naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All fipscode
gen year=2011
order facility_id  facility_name year zipcode addr_state_cd naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All
sort facility_id
save "$rootpath/processed_data/NEI_2011_Alt.dta", replace


import delimited "$rootpath/raw_data/2008neiv3_facility.csv", clear
rename uom emissionsuom
rename total_emissions totalemissions
replace totalemissions=0 if totalemissions==.
gen ann_value=totalemissions if emissionsuom=="TON"
replace ann_value=totalemissions/2000 if emissionsuom=="LB"
rename eis_facility_site_id facility_id
rename pollutant_cd poll
rename naics_cd naics
rename facility_site_name facility_name
rename address_postal_code zipcode
rename state_and_county_fips_code fipscode
keep facility_id poll ann_value facility_name zipcode naics addr_state_cd fipscode
//Here I work on lead because its an actual number variable
gen temp=0
replace temp=ann_value if poll=="7439921"
sort facility_id
egen Lead=sum(temp), by(facility_id)
drop temp
egen All=sum(ann_value), by(facility_id)

replace poll="PM25PRI" if poll=="PM25-PRI"
replace poll="PM10PRI" if poll=="PM10-PRI"
foreach var in NH3 PM25PRI VOC SO2 NOX CO PM10PRI {
gen temp=0
replace temp=ann_value if poll=="`var'"
egen `var'=sum(temp), by(facility_id)
drop temp
}

sort facility_id
///Here I delete duplicates
quietly by facility_id:  gen dup = cond(_N==1,0,_n)
drop if dup>1

keep facility_id  facility_name zipcode addr_state_cd naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All fipscode
gen year=2008
order facility_id  facility_name year zipcode addr_state_cd naics NH3 PM25PRI VOC SO2 NOX CO PM10PRI Lead All 
sort facility_id
save "$rootpath/processed_data/NEI_2008_Alt.dta", replace

import delimited "$rootpath/raw_data/AFS_ACTIONS.csv", clear
//Drop observations prior to 2010
drop if date_achieved<20100000
drop if all_violation_type_codes==""
//General criteria: DIS, GC1-2,3,4,5,6,7,8,9, 10, M1a,b,c, M2a,b,c,, M3a,b,c,d,e,f, M4a,b,c,d,e,f 
keep if regexm(all_violation_type_codes, "G")==1|regexm(all_violation_type_codes, "M")==1

gen transfered_code = 0
replace transfered_code = 1 if regexm(all_violation_type_codes, "M")!=1&regexm(all_violation_type_codes, "GC4")!=1&regexm(all_violation_type_codes, "GC5")!=1&regexm(all_violation_type_codes, "GC6")!=1

save "$rootpath/processed_data/kept_actionsV21.dta", replace


clear
import delimited  "$rootpath/raw_data/AFS_FACILITIES.csv"
keep plant_id state state_number
sort plant_id
save "$rootpath/processed_data/plant_state_mapping.dta", replace

clear
use "$rootpath/processed_data/kept_actionsV21.dta"
merge m:1 plant_id using "$rootpath/processed_data/plant_state_mapping.dta"
drop if _merge!=3
egen avg_old_code_prob = mean(transfered_code), by(state_number)
replace avg_old_code_prob  = 1 - avg_old_code_prob 

sort state_number
quietly by state_number:  gen dup = cond(_N==1,0,_n) 
tab avg_old_code_prob if dup<2

drop if state=="PR"|state=="DC"|state=="VI"
drop if dup>1
sort avg_old_code_prob
gen rank = _n
keep state state_number avg_old_code_prob rank transfered_code
sort state
save "$rootpath/processed_data/state_resource_mappingV21.dta", replace


use "$rootpath/processed_data/NEI_2013.dta", clear
append using "$rootpath/processed_data/NEI_2016.dta"
destring facility_id, gen(facility_id2) force
drop facility_id 
rename facility_id2 facility_id
append using "$rootpath/processed_data/NEI_2008_Alt.dta"
append using "$rootpath/processed_data/NEI_2009.dta"
append using "$rootpath/processed_data/NEI_2010.dta"
append using "$rootpath/processed_data/NEI_2011_Alt.dta"
append using "$rootpath/processed_data/NEI_2012.dta"
append using "$rootpath/processed_data/NEI_2014_Alt.dta"
append using "$rootpath/processed_data/NEI_2015.dta"
append using "$rootpath/processed_data/NEI_2017_Alt.dta"
append using "$rootpath/processed_data/NEI_2018.dta"
append using "$rootpath/processed_data/NEI_2019.dta"
replace fipscode=0 if fipscode==.
drop if facility_id==.
sort facility_id
egen fips=max(fipscode), by(facility_id)
drop if fips==0
tabulate year, gen(y_)
sort facility_id year
quietly by  facility_id year:  gen dup = cond(_N==1,0,_n)
drop if dup>1
drop dup
xtset facility_id year
replace addr_state_cd=addr_state_cd[_n-1] if facility_id==facility_id[_n-1]&addr_state_cd[_n-1]!=""&addr_state_cd==""
replace addr_state_cd=addr_state_cd[_n+1] if facility_id==facility_id[_n-1]&addr_state_cd[_n+1]!=""&addr_state_cd==""
replace addr_state_cd=addr_state_cd[_n-1] if facility_id==facility_id[_n-1]&addr_state_cd[_n-1]!=""&addr_state_cd==""
replace addr_state_cd=addr_state_cd[_n+1] if facility_id==facility_id[_n-1]&addr_state_cd[_n+1]!=""&addr_state_cd==""
replace addr_state_cd=addr_state_cd[_n-1] if facility_id==facility_id[_n-1]&addr_state_cd[_n-1]!=""&addr_state_cd==""
replace addr_state_cd=addr_state_cd[_n+1] if facility_id==facility_id[_n-1]&addr_state_cd[_n+1]!=""&addr_state_cd==""
replace addr_state_cd=addr_state_cd[_n-1] if facility_id==facility_id[_n-1]&addr_state_cd[_n-1]!=""&addr_state_cd==""
replace addr_state_cd=addr_state_cd[_n+1] if facility_id==facility_id[_n-1]&addr_state_cd[_n+1]!=""&addr_state_cd==""
drop if addr_state_cd==""
rename addr_state_cd state
sort state
merge m:1 state using "$rootpath/processed_data/state_resource_mappingV21.dta"
drop if _merge!=3
drop _merge
sort fips
quietly by  fips:  gen dup_fips = cond(_N==1,0,_n)
sort fips
merge m:1 fips using "$rootpath/processed_data/MD_AP3_2014.dta"
foreach var in nh3 pm25 nox so2 voc {
gen temp_`var'=`var' if `var'!=.&dup_fips<2
sort state
egen avg_state_`var'=mean(temp_`var'), by(state)
drop avg_state_`var' temp_`var'
}

drop if _merge!=3
drop _merge

gen tot_damage = (nh3*NH3 + PM25PRI*pm25 + nox*NOX + so2*SO2 + VOC*voc)*1.47
gen log_damage=asinh(tot_damage)
gen log_emissions=asinh(NH3 + PM25PRI + NOX + SO2 + VOC)
gen log_emissions2=asinh((NH3 + PM25PRI + NOX + SO2 + VOC)*(1/100))

tabulate year, gen(yfe_)

gen post=0
replace post=1 if year>2014

gen treatXpost = post*avg_old_code_prob
save "$rootpath/processed_data/Temp_Reg_FileV4.dta", replace


///This is to include county GDP growth. 
clear
import delimited "$rootpath/raw_data/CAGDP1__ALL_AREAS_2001_2021.csv"

replace geofips = subinstr(geofips, char(34), "", .)
destring geofips, gen(fips_code) force
drop if fips_code==.
drop if fips_code<1000
keep if description=="Real GDP (thousands of chained 2012 dollars) "
foreach num of numlist 9/29 {
destring v`num', gen(Y`num') force

}
  
keep fips_code Y16 Y17 Y18 Y19 Y20 Y21 Y22 Y23 Y24 Y25 Y26 Y27
sort fips_code
save "$rootpath/processed_data/fips_GDP.dta", replace


//Here I count plants with HPVs in each state for the pre-policy change period.

import delimited "$rootpath/raw_data/AFS_ACTIONS.csv", clear
//Drop observations prior to 2010
drop if date_achieved<20100000
drop if all_violation_type_codes==""
//General criteria: DIS, GC1-2,3,4,5,6,7,8,9, 10, M1a,b,c, M2a,b,c,, M3a,b,c,d,e,f, M4a,b,c,d,e,f 
keep if regexm(all_violation_type_codes, "G")==1|regexm(all_violation_type_codes, "M")==1
sort plant_id
merge m:1 plant_id using "$rootpath/processed_data/plant_state_mapping.dta"
drop if _merge!=3
drop _merge
sort state
egen state_hpvs=nvals(plant_id), by(state)
sort state
quietly by  state:  gen dup_state = cond(_N==1,0,_n)
drop if dup_state>1
keep state state_number state_hpvs
rename state state_two_hpv
sort state_number
save "$rootpath/processed_data/state_hpv_count.dta", replace

///This is to get the delisted HPV violation count
import delimited "$rootpath/raw_data/AFS_ACTIONS.csv", clear
//Drop observations prior to 2010
drop if date_achieved<20100000
drop if all_violation_type_codes==""
//General criteria: DIS, GC1-2,3,4,5,6,7,8,9, 10, M1a,b,c, M2a,b,c,, M3a,b,c,d,e,f, M4a,b,c,d,e,f 
keep if regexm(all_violation_type_codes, "G")==1|regexm(all_violation_type_codes, "M")==1
sort plant_id
merge m:1 plant_id using "$rootpath/processed_data/plant_state_mapping.dta"
gen temp_plant_id=plant_id if _merge==3
sort state
egen hpv_plants_pre=nvals(temp_plant_id), by(state)
egen tot_hpv_plants=max(hpv_plants_pre), by(state)
drop if regexm(all_violation_type_codes, "M")!=1&regexm(all_violation_type_codes, "GC4")!=1&regexm(all_violation_type_codes, "GC5")!=1&regexm(all_violation_type_codes, "GC6")!=1

sort state
egen state_facils=nvals(plant_id), by(state)
drop if _merge!=3
drop _merge
sort state
egen state_hpvs=nvals(plant_id), by(state)
sort state
quietly by  state:  gen dup_state = cond(_N==1,0,_n)
drop if dup_state>1
keep state state_number state_hpvs state_facils tot_hpv_plants
rename state_hpvs state_delisted_hpvs
rename state state_two_hpv
sort state_number
save "$rootpath/processed_data/state_delisted_hpv_count.dta", replace


import delimited "$rootpath/raw_data/AFS_ACTIONS.csv", clear
//Drop observations prior to 2010
drop if date_achieved<20100000
drop if all_violation_type_codes==""
//General criteria: DIS, GC1-2,3,4,5,6,7,8,9, 10, M1a,b,c, M2a,b,c,, M3a,b,c,d,e,f, M4a,b,c,d,e,f 
keep if regexm(all_violation_type_codes, "G")==1|regexm(all_violation_type_codes, "M")==1

gen transfered_code = 0
replace transfered_code = 1 if regexm(all_violation_type_codes, "M4")!=1&regexm(all_violation_type_codes, "GC4")!=1&regexm(all_violation_type_codes, "GC5")!=1&regexm(all_violation_type_codes, "GC6")!=1&regexm(all_violation_type_codes, "M3A")!=1&regexm(all_violation_type_codes, "M3B")!=1&regexm(all_violation_type_codes, "M3C")!=1&regexm(all_violation_type_codes, "M3D")!=1&regexm(all_violation_type_codes, "M3E")!=1


merge m:1 plant_id using "$rootpath/processed_data/plant_state_mapping.dta"
drop if _merge!=3
egen avg_old_code_prob2 = mean(transfered_code), by(state_number)
replace avg_old_code_prob2  = 1 - avg_old_code_prob 

sort state_number
quietly by state_number:  gen dup = cond(_N==1,0,_n) 
tab avg_old_code_prob if dup<2

drop if state=="PR"|state=="DC"|state=="VI"
drop if dup>1
sort avg_old_code_prob
gen rank = _n
keep state state_number avg_old_code_prob2 
sort state
save "$rootpath/processed_data/state_resource_mapping_ALT.dta", replace


use "$rootpath/processed_data/Temp_Reg_FileV4.dta", clear
sort facility_id 
rename facility_id eis_id
merge m:1 eis_id using "$rootpath/processed_data/eis_id_to_frsnumber1.dta"
drop if _merge!=3
drop _merge
drop fips_code
rename eis_id facility_id
rename fips fips_code
sort fips_code
merge m:1 fips_code using "$rootpath/processed_data/fips_GDP.dta"
keep if _merge==3
drop _merge
gen multiplier=.
//Note Y16 is 2008. 
foreach num of numlist 16/27 {
gen year_temp=1992 + `num'
replace multiplier=Y`num'/Y22 if year==year_temp
drop year_temp
}

replace tot_damage=tot_damage*multiplier
replace log_damage=asinh(tot_damage)
xtset facility_id year


sort facility_id year
egen min_year=min(year), by(facility_id)
egen max_year=max(year), by(facility_id)
gen reg_nei=0
replace reg_nei=tot_damage if year==2011|year==2014|year==2017
egen max_reg_nei=max(reg_nei), by(facility_id)

gen off_nei=0
replace off_nei=tot_damage if year!=2011&year!=2014&year!=2017&year!=2008
egen max_off_nei=max(off_nei), by(facility_id)

gen zero=0
replace zero=1 if tot_damage==0
sort facility_id year
egen tot_zero=sum(zero), by(facility_id)

//gen log_damage2=asinh(tot_damage/100000)
gen log_damage3=asinh(tot_damage/1000)
gen log_damage2=asinh(tot_damage/1000000)
gen log_damage4=asinh(tot_damage/10000)
gen log_damage5=asinh(tot_damage/100000)
gen log_damage55=asinh(tot_damage/(1.47*100000))
gen log_damage22=asinh(tot_damage/(1.47*1000000))


gen vintage=0
replace vintage=1 if year<2011
replace vintage=2 if year>2010&year<2014
replace vintage=3 if year>2013&year<2017
replace vintage=4 if year>2016
egen max_vintage=max(vintage), by(facility_id)
egen min_vintage=min(vintage), by(facility_id)

sort facility_id year
gen log_emissions3=asinh((NH3 + PM25PRI + NOX + SO2 + VOC)*(1/1000))
gen log_emissions4=asinh((NH3 + PM25PRI + NOX + SO2 + VOC)*(1/10))
gen tot_emissions=(NH3 + PM25PRI + NOX + SO2 + VOC)
egen variety=nvals(tot_damage), by(facility_id)
egen variety_pol=nvals(log_emissions), by(facility_id)
egen max_damage=max(tot_damage), by(facility_id)
gen bad_obs=0
replace bad_obs=1 if tot_emissions>=10000
egen max_bad_obs=max(bad_obs), by(facility_id)

//WORKS NO 4 facility_name, abibow. Dave Johnston
gen bad_obs2=0
replace bad_obs2=1 if tot_emissions>=5000
egen max_bad_obs2=max(bad_obs2), by(facility_id)

gen bad_obs3=0
replace bad_obs3=1 if tot_damage>1e+09
egen max_bad_obs3=max(bad_obs3), by(facility_id)

gen n4=floor(naics/100)
gen n5=floor(naics/10)
gen lead_1=1
gen lead_2=0
replace lead_2=avg_old_code_prob if year==2013
gen lead_3=0
replace lead_3=avg_old_code_prob if year==2012
gen lead_4=0
replace lead_4=avg_old_code_prob if year==2011
gen lead_5=0
replace lead_5=avg_old_code_prob if year==2010
gen lag_1=0
replace lag_1=avg_old_code_prob if year==2015
gen lag_2=0
replace lag_2=avg_old_code_prob if year==2016
gen lag_3=0
replace lag_3=avg_old_code_prob if year==2017
gen lag_4=0
replace lag_4=avg_old_code_prob if year==2018
gen lag_5=0
replace lag_5=avg_old_code_prob if year==2019

set emptycells drop
set matsize 5000

reghdfe  log_emissions4 treatXpost if year>2009&max_damage>0&max_bad_obs2==0&min_year<2015&max_year>2014, absorb(facility_id n5#year) vce(cluster state_number)
reghdfe  log_damage55 treatXpost if year>2009&max_damage>0&max_bad_obs2==0&min_year<2015&max_year>2014, absorb(facility_id n5#year) vce(cluster state_number)

reghdfe  log_damage55 lead_5 lead_4 lead_3 lead_2 lag_* if year>2009&max_damage>0&max_bad_obs2==0&min_year<2015&max_year>2014, absorb(facility_id n5#year) vce(cluster state_number)

reghdfe  log_damage55 treatXpost if year>2009&max_damage>0&max_bad_obs2==0&min_year<2015&max_year>2014, absorb(facility_id n5#year) vce(cluster state_number)
drop if e(sample)!=1
save "$rootpath/processed_data/Temp_Reg_FileV5.dta", replace


use "$rootpath/processed_data/kept_actionsV21.dta", clear
sort afs_id
merge m:1 afs_id using "$rootpath/processed_data/FRS_AFS_LINK.dta"
keep if _merge==3
drop _merge
gen year=floor(date_achieved/10000)
tab year
sort frsnumber year
egen max_transfered_code=max(transfered_code), by(frsnumber year)

sort frsnumber year
quietly by frsnumber year:  gen dup = cond(_N==1,0,_n) 
drop if dup>1
sort frsnumber year
keep frsnumber year max_transfered_code
save "$rootpath/processed_data/transfered_plant_level.dta", replace

import delimited "$rootpath/raw_data/ICIS-AIR_FACILITIES.csv", clear
rename registry_id frsnumber
drop if frsnumber==.
keep pgm_sys_id frsnumber
sort frsnumber
duplicates drop
save "$rootpath/processed_data/ICIS_to_FRS_temp.dta"

use "$rootpath/processed_data/ICIS_QUARTERLY_TOTAL_VIOLATIONS.dta", clear
sort pgm_sys_id
merge m:1 pgm_sys_id using "$rootpath/processed_data/ICIS_to_FRS_temp.dta"
drop if _merge!=3
drop _merge
gen year=floor(historical_compliance_date/100)+2000
sort frsnumber year
egen max_frv_year=max(tot_frv), by(frsnumber year)
egen max_hpv_year=max(tot_hpv), by(frsnumber year)
sort frsnumber year
quietly by frsnumber year:  gen dup = cond(_N==1,0,_n) 
drop if dup>1
keep frsnumber year max_frv_year max_hpv_year 
save "$rootpath/processed_data/transfered_plant_history.dta", replace
